library(ggplot2)
library(reshape2)
library(dplyr)
# Fix the random seed so any stochastic step is reproducible
set.seed(1996)
#############################
# Clear the workspace, keeping only the objects named below
rm(
  list = setdiff(
    ls(),
    c('script', 'scripts', 'log_file')
  )
)
#############################
# Load the candidate-level data set used throughout this script
master <- readRDS('master_web.rds')

#############################
# Percent of Races -- page 2
#############################
# Share of partisan primaries with an incumbent and a challenger.
# Builds one row per race; inc_race is the sum of `inc` over the race's rows
# (presumably `inc` is a 0/1 incumbent flag -- confirm against the codebook).
race <- master %>%
  filter(year <= 2020, # Keep years up to and including 2020
         # Remove non-partisan primary states
         state_postal != 'CA',
         state_postal != 'WA',
         state_postal != 'LA') %>%
  group_by(raceid, candnumber, state_postal, cd, year) %>%
  summarise(inc_race = sum(inc)) %>%
  # summarise() only peels off the last grouping level; drop the rest so the
  # result is not silently treated as grouped further down
  ungroup() %>%
  # Binary: 1 when the race has at least one incumbent and candnumber > 1
  # (presumably candnumber is the number of candidates in the race)
  mutate(inc_challenged = ifelse(inc_race >= 1 & candnumber > 1, 1, 0))

# Proportion of races in which an incumbent is challenged
mean(race$inc_challenged)

# Merge the race-level variables back onto master. all.x = TRUE keeps every
# master row; rows with no match in race (e.g. the states and years filtered
# out above) get NA for inc_race and inc_challenged.
master <- merge(master, race,
                by = c('raceid', 'candnumber', 'state_postal', 'cd', 'year'),
                all.x = TRUE)

# Share of districts with an incumbent and a challenger.
# max() collapses the race-level 0/1 flags into a district-level 0/1
# indicator, so a district-year in which more than one race has a challenged
# incumbent is counted once and the mean below stays a proportion.
race2 <- race %>%
  group_by(state_postal, cd, year) %>%
  summarise(race_incumbent = max(inc_challenged)) %>%
  ungroup()
# Proportion of district-years with a challenged incumbent
mean(race2$race_incumbent)

#############################
# Coverage Numbers -- page 11 and 12
#############################
# Candidates by website status: counts, then shares of the non-missing total
table(master$website)
prop.table(table(master$website))

# Candidates by issue-page status: counts, then shares of all candidates
table(master$issue_page)
table(master$issue_page) / nrow(master)

# Share of candidates with an issue page, conditional on having a website.
# Both the counts and the denominator are restricted to website == 'Yes'
# rows; which() drops rows where website is missing.
table(master$issue_page[which(master$website == 'Yes')]) /
  sum(master$website == 'Yes', na.rm = TRUE)

# Each count below is the number of rows where the condition is TRUE; rows
# where the condition evaluates to NA are not counted (na.rm = TRUE).

# CFscore coverage: number and share of all candidates with a CFscore
with(master, sum(!is.na(cfscore), na.rm = TRUE))
with(master, sum(!is.na(cfscore), na.rm = TRUE)) / nrow(master)

# WEB score coverage among inexperienced candidates (quality_cand == 0)
with(master, sum(quality_cand == 0 & !is.na(web_score), na.rm = TRUE))
with(master, sum(quality_cand == 0 & !is.na(web_score), na.rm = TRUE) /
       sum(quality_cand == 0, na.rm = TRUE))

# WEB score coverage among incumbents (quality_cand == 2)
with(master, sum(quality_cand == 2 & !is.na(web_score), na.rm = TRUE))
with(master, sum(quality_cand == 2 & !is.na(web_score), na.rm = TRUE) /
       sum(quality_cand == 2, na.rm = TRUE))

# WEB score coverage among experienced candidates (quality_cand == 1)
with(master, sum(quality_cand == 1 & !is.na(web_score), na.rm = TRUE))
with(master, sum(quality_cand == 1 & !is.na(web_score), na.rm = TRUE) /
       sum(quality_cand == 1, na.rm = TRUE))

# CFscore coverage among inexperienced candidates (quality_cand == 0)
with(master, sum(quality_cand == 0 & !is.na(cfscore), na.rm = TRUE))
with(master, sum(quality_cand == 0 & !is.na(cfscore), na.rm = TRUE) /
       sum(quality_cand == 0, na.rm = TRUE))

# CFscore coverage among incumbents (quality_cand == 2)
with(master, sum(quality_cand == 2 & !is.na(cfscore), na.rm = TRUE))
with(master, sum(quality_cand == 2 & !is.na(cfscore), na.rm = TRUE) /
       sum(quality_cand == 2, na.rm = TRUE))

# CFscore coverage among experienced candidates (quality_cand == 1)
with(master, sum(quality_cand == 1 & !is.na(cfscore), na.rm = TRUE))
with(master, sum(quality_cand == 1 & !is.na(cfscore), na.rm = TRUE) /
       sum(quality_cand == 1, na.rm = TRUE))

# Candidates lacking a CFscore: how many there are, how many of them still
# have a WEB score, and that second count as a share of the first
with(master, sum(is.na(cfscore), na.rm = TRUE))
with(master, sum(is.na(cfscore) & !is.na(web_score), na.rm = TRUE))
with(master, sum(is.na(cfscore) & !is.na(web_score), na.rm = TRUE) /
       sum(is.na(cfscore), na.rm = TRUE))

# Challenged an Incumbent (page 9) WEB Score versus CF Score
# Among non-incumbents (quality_cand != 2) in races with inc_race == 1 and
# year <= 2020: share missing a CFscore, then share missing a WEB score.
# Rows where the condition evaluates to NA are not counted.
# NOTE(review): inc_race == 1 leaves out races where inc_race > 1 -- confirm
# that restricting to exactly one incumbent is intended.
with(master, {
  challenger <- quality_cand != 2 & inc_race == 1 & year <= 2020
  sum(challenger & is.na(cfscore), na.rm = TRUE) /
    sum(challenger, na.rm = TRUE)
})


with(master, {
  challenger <- quality_cand != 2 & inc_race == 1 & year <= 2020
  sum(challenger & is.na(web_score), na.rm = TRUE) /
    sum(challenger, na.rm = TRUE)
})


#############################
# Figure 1 - Coverage by Candidate Type
#############################
# Keep only what the figure needs: a written candidate-type label plus 0/1
# flags for whether a CFscore / WEB score is present
figure1 <- master %>%
  mutate(
    # Any quality_cand value other than 0, 1 or 2 (including NA) is left NA
    quality = case_when(
      quality_cand == 2 ~ 'Incumbent',
      quality_cand == 1 ~ 'Experienced Challenger',
      quality_cand == 0 ~ 'Inexperienced Challenger'
    ),
    cfscore_valid = ifelse(is.na(cfscore), 0, 1),
    web_valid = ifelse(is.na(web_score), 0, 1)
  ) %>%
  select(quality, cfscore_valid, web_valid)

# Create plot data frame: number of candidates with each score, by candidate
# type. Rows with missing quality are dropped.
figure1.plot <- figure1 %>%
  filter(!is.na(quality)) %>%
  group_by(quality) %>%
  summarise(`CFscores` = sum(cfscore_valid),
            `Campaign Websites` = sum(web_valid))

# Melt into long format: one row per (quality, data source) pair, with the
# source name in `variable` and the count in `value`
figure1.plotdf <- melt(figure1.plot, id.vars = 'quality')

# Plot: side-by-side bars per candidate type, one bar per data source
figure1.gg <- ggplot(figure1.plotdf,
                     aes(x = quality, y = value, fill = variable)) +
  geom_bar(position = "dodge", stat = "identity") +
  theme_bw() +
  scale_fill_manual(values = c('gray1', 'gray55'),
                    name = 'Data') +
  xlab('\nCandidate Type') +
  ylab('Number of Candidates\n') +
  theme(axis.text = element_text(size = 20),
        axis.title = element_text(size = 25),
        legend.title = element_text(size = 25),
        legend.text = element_text(size = 20),
        legend.position = 'bottom')
# Display the figure, as the unassigned ggplot() call did before
figure1.gg

# Save figure; the plot is passed explicitly instead of relying on whichever
# plot ggplot2 last recorded
ggsave('fg1.tiff', plot = figure1.gg, width = 13, height = 10, units = 'in')


